Ryan Tibshirani, November 16, 2020
We first fetch state-level JHU CSSE confirmed cases and deaths from Delphi’s COVIDcast API. Here and throughout we look at the 7-day smoothed versions of these signals (via 7-day trailing averages).
# First day of data to pull from the API
start_day = "2020-03-01"
# Fetch state-level JHU CSSE confirmed cases and deaths (7-day trailing
# average incidence counts) from Delphi's COVIDcast API. suppressMessages()
# hides the API's progress chatter.
suppressMessages({
# Daily new confirmed cases, 7-day averaged, one row per state per day
cases = covidcast_signal(data_source = "jhu-csse",
signal = "confirmed_7dav_incidence_num",
start_day = start_day, geo_type = "state")
# Daily new deaths, 7-day averaged, same geography and date range
deaths = covidcast_signal(data_source = "jhu-csse",
signal = "deaths_7dav_incidence_num",
start_day = start_day, geo_type = "state")
})
Now we compute correlations between cases and deaths, for a bunch of different lags for the cases signal. We restrict our attention to data after April 1.
# Compute lagged correlations between two covidcast signals.
#
# For each lag in `lags`, the x signal is shifted back by `lag` days
# (dt_x = -lag) and correlated against y, giving one correlation per
# geo location per lag.
#
# Args:
#   x, y: covidcast_signal data frames (must have a time_value column).
#   lags: integer vector of lags, in days, to apply to the x signal.
#   start_day, end_day: optional date range to restrict to; defaults to
#     the full range of x's time values.
#   ...: additional arguments passed on to covidcast_cor().
#
# Returns: a data frame with one row per (geo location, lag), with the
# date range recorded in "start_day"/"end_day" attributes for later use
# in plot subtitles.
compute_cor = function(x, y, lags = 0:40, start_day = NULL, end_day = NULL,
                       ...) {
  # Default to the full date range of x. (Fix: base R min()/max() are
  # lowercase; Min()/Max() do not exist.)
  if (is.null(start_day)) start_day = min(x$time_value)
  if (is.null(end_day)) end_day = max(x$time_value)
  start_day = as.Date(start_day); end_day = as.Date(end_day)
  x = x %>% filter(between(time_value, start_day, end_day))
  y = y %>% filter(between(time_value, start_day, end_day))
  # One correlation data frame per lag, stacked row-wise, with the lag
  # value carried along in a new column
  x = map_dfr(lags, function(lag) {
    covidcast_cor(x, y, dt_x = -lag, ...) %>% mutate(lag = lag)
  })
  # Stash the date range on the result for downstream labeling
  attr(x, "start_day") = start_day
  attr(x, "end_day") = end_day
  return(x)
}
# One correlation per state and lag value, restricted to data after April 1
df = compute_cor(cases, deaths, start_day = "2020-04-01")
Note that we now have one correlation per state and lag value. Now for each lag, we try aggregating them (over states) in three different ways: median, mean, and population-weighted mean.
# Summarize by median, mean, population-weighted mean
# For each lag, aggregate the per-state correlations three ways. State
# populations come from the census table, joined on lowercase state
# abbreviation. (Fix: base R median()/mean()/sum() are lowercase;
# Median()/Mean()/Sum() do not exist.)
df_summary = state_census %>%
  select(POPESTIMATE2019, ABBR) %>%
  rename("pop" = POPESTIMATE2019, "geo_value" = ABBR) %>%
  # covidcast geo_values are lowercase state abbreviations
  mutate(geo_value = tolower(geo_value)) %>%
  inner_join(df, by = "geo_value") %>%
  group_by(lag) %>%
  summarize(Median = median(value),
            Mean = mean(value),
            WeightedMean = sum(value * pop) / sum(pop)) %>%
  # Long format: one row per (lag, metric) for easy ggplot coloring
  pivot_longer(cols = c("Median", "Mean", "WeightedMean"),
               names_to = "metric", values_to = "value")
# Now compute the maximizing lag for each metric
# (Fix: base R max() is lowercase; Max() does not exist. Note ties would
# keep multiple rows per metric.)
df_argmax = df_summary %>%
  group_by(metric) %>%
  filter(value == max(value))
# Shared title/subtitle strings, reused by the next plot as well; the date
# range comes from the attributes set by compute_cor()
title = "Correlation between lagged cases and deaths"
subtitle = sprintf("State-level, over %s to %s",
format.Date(attributes(df)$start_day, "%B %d"),
format.Date(attributes(df)$end_day, "%B %d"))
# Aggregated correlation vs. lag, one colored curve per metric, with a
# vertical line at each metric's maximizing lag
ggplot(df_summary, aes(x = lag, y = value)) +
geom_point(aes(color = metric)) +
geom_line(aes(color = metric)) +
geom_vline(data = df_argmax,
aes(xintercept = lag, color = metric),
size = 0.75, show.legend = FALSE) +
labs(x = "Lag (days)", y = "Correlation", title = title,
subtitle = subtitle) +
theme(legend.position = "bottom", legend.title = element_blank())
The median and mean curves are quite different, suggesting that the distribution of correlations per lag (over states) is skewed, and likely also has a big spread given the gap between the median and mean. The mean curve is maximized at 20 days, but has very little curvature around its maximum. The median curve is maximized at 16 days, but has a second local max at 28 days. From this picture, it seems difficult to look at any one lag as the “right” one.
Below we plot each correlation per lag value (over states) and overlay the median and mean curves on top. Since the population-weighting didn’t really do much to affect the mean curve, we exclude it.
# Scatter of all per-state correlations by lag, with the median and mean
# summary curves overlaid (weighted mean excluded: nearly identical to mean)
ggplot(df, aes(x = lag, y = value)) +
geom_point(alpha = 0.25) +
geom_line(data = df_summary %>%
filter(metric != "WeightedMean"),
aes(x = lag, y = value, color = metric),
size = 1.25) +
labs(x = "Lag (days)", y = "Correlation", title = title,
subtitle = subtitle) +
geom_vline(data = df_argmax %>%
filter(metric != "WeightedMean"),
aes(xintercept = lag, color = metric),
size = 0.75, show.legend = FALSE) +
theme(legend.position = "bottom", legend.title = element_blank())
As we guessed, there’s both a lot of asymmetry and a huge spread. So it seems pretty difficult to agree on 16 vs 20 vs 28 as the “right” lag with any amount of confidence.
Next we look at the distribution of maximizing lags per state.
# Compute the maximizing lag per state
# (Fix: base R max() is lowercase; Max() does not exist.)
df_argmax_by_state = df %>%
  group_by(geo_value) %>%
  filter(value == max(value)) %>%
  ungroup() %>%
  # Within each lag, assign stacking heights 1..n so states sharing a
  # maximizing lag are drawn above one another in the dot plot
  group_by(lag) %>%
  mutate(y = 1:n()) %>%
  ungroup()
# Dot plot of maximizing lags, labeled with state abbreviations and
# stacked vertically where states share the same lag
ggplot(df_argmax_by_state, aes(x = lag, y = y, label = geo_value)) +
geom_point(size = 3, alpha = 0.25) + geom_text() +
labs(x = "Lag (days)", y = "Count", title = "Maximizing lags by state",
subtitle = subtitle)
Several states have maximizing lags somewhere around 20, but there’s still a huge spread. Once again, it’s hard to walk away with a clear picture for the “right” lag.
We compute the case-fatality-ratio (CFR), for various lags. For each lag value \(k\), we actually shift the death signals forward in time \(k\) days, then take a ratio to current cases. Thus the interpretation of the (say) CFR using a lag of 10 estimated on June 1 is as follows: of the cases that arrive June 1, it gives the fraction that will die on June 11.
# Add national signals by summing up cases and deaths
# Append a national aggregate (geo_value = "us") to a state-level
# covidcast data frame: the national value on each day is the sum of all
# state values. (Fix: base R sum() is lowercase; Sum() does not exist.)
append_nat = function(x) {
  x %>%
    select(data_source, signal, time_value, geo_value, value) %>%
    bind_rows(x %>%
                group_by(data_source, signal, time_value) %>%
                summarize(geo_value = "us", value = sum(value))) %>%
    # Fill in the remaining covidcast columns with placeholder values so
    # the result still looks like a covidcast_signal data frame
    mutate(issue = time_value, lag = NA, stderr = NA, sample_size = NA)
}
# Add the national "us" row to both signals (overwrites in place)
cases = append_nat(cases)
deaths = append_nat(deaths)
# Aggregate cases and deaths with dt = 10:30
# Wide data frame: current cases (dt = 0) alongside deaths shifted
# forward by 10..30 days, one column per shift
signals = aggregate_signals(list(cases, deaths), dt = list(0, 10:30))
# Divide each of the death signals by cases
# Express signal columns as a percentage of a reference column.
#
# Every column whose name starts with `prefix` is divided by the column
# named in `by` and multiplied by 100; the reference column is then
# dropped and the result reshaped to long format, with a numeric `dt`
# column parsed out of the original column names (e.g. "value+10:...").
scale_by = function(x, by, prefix = "value") {
  x %>%
    mutate(across(starts_with(prefix), ~ 100 * .x / .data[[by]])) %>%
    select(-all_of(by)) %>%
    pivot_longer(cols = starts_with(prefix),
                 names_to = "name", values_to = "value") %>%
    # Column names look like "value+10:source_signal"; keep the shift part
    separate(col = "name", into = c("dt", "rest"), sep = ":") %>%
    mutate(dt = as.numeric(sub("value", "", dt))) %>%
    select(-rest)
}
# Divide each shifted death column by current cases to get CFR (%)
cfr = scale_by(signals, by = "value+0:jhu-csse_confirmed_7dav_incidence_num")
First we plot the CFR nationally. We restrict our attention to data after April 15, and to lags in increments of 5 for visibility.
# National CFR over time, one curve per lag (every 5th lag only, for
# readability), with a reference line at 1.5%
ggplot(cfr %>% filter(geo_value == "us",
time_value >= "2020-04-15",
dt %in% seq(10, 30, by = 5)),
aes(x = time_value, y = value)) +
geom_line(aes(color = as.factor(dt))) +
geom_hline(yintercept = 1.5, linetype = 2) +
scale_x_date(breaks = "months", date_labels = "%b") +
labs(x = "Date", y = "Case-fatality-ratio (%)",
title = "Case-fatality-ratio, US", color = "Lag")
We can see that the CFR has come down from the scary numbers we were seeing in the early stages of the pandemic and has been mostly flat since July, hovering somewhere above 1.5% (the horizontal dashed line). If we believe the shorter lags are relevant, then it looks like the CFR is possibly dropping in early November; but if we believe the longer lags are more relevant, then this really remains to be seen.
Next we zoom in on July 1 through current day.
# Same national CFR plot, zoomed to July 1 onward
ggplot(cfr %>% filter(geo_value == "us",
time_value >= "2020-07-01",
dt %in% seq(10, 30, by = 5)),
aes(x = time_value, y = value)) +
geom_line(aes(color = as.factor(dt))) +
geom_hline(yintercept = 1.5, linetype = 2) +
scale_x_date(breaks = "months", date_labels = "%b") +
labs(x = "Date", y = "Case-fatality-ratio (%)",
title = "Case-fatality-ratio, US", color = "Lag")
We do the same as in the last part, but for each state individually.
# Per-state CFR curves, faceted by state; y-axis clipped to [0, 8]% so a
# few extreme states don't flatten the rest
ggplot(cfr %>% filter(geo_value != "us",
time_value >= "2020-04-15",
dt %in% seq(10, 30, by = 5)),
aes(x = time_value, y = value)) +
geom_line(aes(color = as.factor(dt))) +
geom_hline(yintercept = 1.5, linetype = 2) +
coord_cartesian(ylim = c(0, 8)) +
labs(x = "", y = "", color = "Lag") +
scale_x_date(breaks = "months", date_labels = "%b") +
facet_wrap(~ geo_value, ncol = 3, scales = "free_x") +
theme(legend.position = "top")
We consider forecasting \(k\)-day ahead national deaths based on current cases and two different mechanisms, with respect to the CFR:
# Join CFR along with cases and future deaths
# Result has, per (location, date, dt): the local CFR, observed deaths on
# that date, and cases from dt days earlier (column "cases-dt")
cfr_signals = inner_join(
inner_join(cfr %>% rename("cfr" = value),
deaths %>%
select(time_value, geo_value, value) %>%
rename("deaths" = value),
by = c("geo_value", "time_value")),
# Cases shifted back dt days; sign of dt flipped so it matches the
# positive dt convention used in the cfr data frame
aggregate_signals(cases, dt = -(10:30), format = "long") %>%
select(geo_value, time_value, dt, value) %>%
mutate(dt = -dt) %>%
rename("cases-dt" = value),
by = c("geo_value", "time_value", "dt"))
# Consider the following global values for CFR
cfr_vals = c(1.4, 1.7, 2.1)
# Now make forecasts k days into the future, for each k in dt = 10:30, using:
# 1. the local CFR computed for that date and location, and
# 2. the global CFR values
# One copy of cfr_signals per forecaster type, stacked; prediction is
# CFR (%) applied to the earlier case count
cfr_all = cfr_signals %>% mutate(type = "local") %>%
rbind(map_dfr(cfr_vals, function(val) {
cfr_signals %>% mutate(cfr = val, type = paste("global:", val))
})) %>%
mutate(pred = cfr * `cases-dt` / 100)
# Plot mean absolute forecast error versus lag, for one location.
#
# Args:
#   geo_value: location to plot ("us" or a state abbreviation).
#   start_day, end_day: optional date range to evaluate over; defaults
#     to the full range available in cfr_all.
#
# Returns: a ggplot object, one error curve per forecaster type.
plot_err = function(geo_value, start_day = NULL, end_day = NULL) {
  # (Fix: base R min()/max()/mean() are lowercase; Min()/Max()/Mean()
  # do not exist.)
  if (is.null(start_day)) start_day = min(cfr_all$time_value)
  if (is.null(end_day)) end_day = max(cfr_all$time_value)
  start_day = as.Date(start_day); end_day = as.Date(end_day)
  # Copy the argument so it doesn't collide with the geo_value column
  # inside filter()
  given_geo_value = geo_value
  ggplot(cfr_all %>%
           filter(geo_value == given_geo_value,
                  between(time_value, start_day, end_day)) %>%
           group_by(dt, type) %>%
           summarize(error = mean(abs(deaths - pred))),
         aes(x = dt, y = error)) +
    geom_point(aes(color = type)) +
    geom_line(aes(color = type)) +
    labs(x = "Lag", y = "Mean absolute error (deaths)", color = "",
         title = paste("Forecast errors,", toupper(geo_value)),
         subtitle = sprintf("Over %s to %s",
                            format.Date(start_day, "%B %d"),
                            format.Date(end_day, "%B %d")))
}
# National forecast errors, evaluated from July 1 onward
plot_err(geo_value = "us", start_day = "2020-07-01")
We can see that, pretty much across the board (for any lag value), the global forecaster with CFR = 1.7% is the most accurate: easily more accurate than the other two global forecasters, and a little more accurate than the local one. It appears that the global CFR = 1.7% forecaster is most accurate at lag 16, that is, 16-day-ahead forecasts.
For simplicity, we now restrict our attention to this model: CFR = 1.7%. We look at forecasts 16, 22, and 28 days ahead (these are the lags for the CFR model). Below we plot the predictions along with actual deaths and we plot uncertainty bands based on quantiles of residuals.
# Plot forecasts against observed deaths, with an uncertainty band.
#
# Args:
#   geo_value: location to plot ("us" or a state abbreviation).
#   dt: forecast horizon in days (one of the lags in cfr_all).
#   type: forecaster type, e.g. "local" or "global: 1.7".
#   start_day, end_day: optional date range; defaults to the full range
#     available in cfr_all.
#
# Returns: a ggplot object with observed deaths, predictions, and a 95%
# band based on empirical quantiles of past residuals.
plot_pred = function(geo_value, dt, type, start_day = NULL, end_day = NULL) {
  # (Fix: base R min()/max() are lowercase; Min()/Max() do not exist.)
  if (is.null(start_day)) start_day = min(cfr_all$time_value)
  if (is.null(end_day)) end_day = max(cfr_all$time_value)
  start_day = as.Date(start_day); end_day = as.Date(end_day)
  # Copies so the arguments don't collide with same-named columns in
  # filter() below
  given_geo_value = geo_value; given_dt = dt; given_type = type
  # Filter down to what we need
  x = cfr_all %>%
    filter(geo_value == given_geo_value,
           type == given_type, dt == given_dt,
           between(time_value, start_day, end_day))
  # Compute quantiles of model residuals (2.5% and 97.5%, for a 95% band)
  probs = c(0.025, 0.975)
  quant = quantile(x$deaths - x$pred, probs, na.rm = TRUE)
  # Add predictions for current day: apply the most recent CFR to the
  # latest dt days of cases, dated dt days into the future (no observed
  # deaths exist for these rows yet)
  given_cfr = x %>% filter(time_value == max(time_value)) %>% pull(cfr)
  x = x %>% bind_rows(
    cases %>% filter(geo_value == given_geo_value,
                     time_value >= max(time_value) - given_dt) %>%
      select(time_value, geo_value, value) %>%
      mutate(dt = given_dt, cfr = given_cfr) %>%
      rename("cases-dt" = value) %>%
      mutate(pred = cfr * `cases-dt` / 100) %>%
      mutate(time_value = time_value + dt))
  colors = c("Observed" = "black", "Predicted" = "red")
  ggplot(x, aes(x = time_value)) +
    geom_ribbon(aes(ymin = pred + quant[1],
                    ymax = pred + quant[2],
                    fill = "Predicted"),
                alpha = 0.2, show.legend = FALSE) +
    geom_line(aes(y = deaths, color = "Observed")) +
    geom_line(aes(y = pred, color = "Predicted")) +
    scale_color_manual(values = colors) +
    labs(x = "Date", y = "Deaths", color = "",
         title = sprintf("%s-day-ahead forecast, %s", given_dt,
                         toupper(geo_value)),
         subtitle = sprintf("Over %s to %s",
                            format.Date(start_day, "%B %d"),
                            format.Date(end_day, "%B %d")))
}
# Forecast plots for the global CFR = 1.7% model at 16, 22, and 28 days ahead
plot_pred(geo_value = "us", dt = 16, type = "global: 1.7",
start_day = "2020-07-01")
plot_pred(geo_value = "us", dt = 22, type = "global: 1.7",
start_day = "2020-07-01")
plot_pred(geo_value = "us", dt = 28, type = "global: 1.7",
start_day = "2020-07-01")